from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
!pip install nbconvert
Requirement already satisfied: nbconvert in /usr/local/lib/python3.10/dist-packages (6.5.4) Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from nbconvert) (4.9.4) Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (4.12.3) Requirement already satisfied: bleach in /usr/local/lib/python3.10/dist-packages (from nbconvert) (6.1.0) Requirement already satisfied: defusedxml in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.7.1) Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.4) Requirement already satisfied: jinja2>=3.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (3.1.4) Requirement already satisfied: jupyter-core>=4.7 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (5.7.2) Requirement already satisfied: jupyterlab-pygments in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.3.0) Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (2.1.5) Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.8.4) Requirement already satisfied: nbclient>=0.5.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.10.0) Requirement already satisfied: nbformat>=5.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (5.10.4) Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from nbconvert) (24.1) Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (1.5.1) Requirement already satisfied: pygments>=2.4.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (2.16.1) Requirement already satisfied: tinycss2 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (1.3.0) Requirement already satisfied: traitlets>=5.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (5.7.1) Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.10/dist-packages (from jupyter-core>=4.7->nbconvert) (4.3.2) Requirement already satisfied: jupyter-client>=6.1.12 in /usr/local/lib/python3.10/dist-packages (from nbclient>=0.5.0->nbconvert) (6.1.12) Requirement already satisfied: fastjsonschema>=2.15 in /usr/local/lib/python3.10/dist-packages (from nbformat>=5.1->nbconvert) (2.20.0) Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.10/dist-packages (from nbformat>=5.1->nbconvert) (4.23.0) Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->nbconvert) (2.6) Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from bleach->nbconvert) (1.16.0) Requirement already satisfied: webencodings in /usr/local/lib/python3.10/dist-packages (from bleach->nbconvert) (0.5.1) Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (24.2.0) Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (2023.12.1) Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (0.35.1) Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (0.20.0) Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.10/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (24.0.1) Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.10/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (2.8.2) Requirement already satisfied: tornado>=4.1 in /usr/local/lib/python3.10/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (6.3.3)
! jupyter nbconvert --to html "/content/Menna_MOFA_renal_carcinoma.ipynb"
[NbConvertApp] Converting notebook /content/Menna_MOFA_renal_carcinoma.ipynb to html [NbConvertApp] Writing 4229549 bytes to /content/Menna_MOFA_renal_carcinoma.html
install.packages("reticulate")
#! pip install mofapy2 #note:this is a python code
#Import the mofapy2 package
#import mofapy2 as mof
reticulate::py_run_string("import mofapy2")
Installing package into ‘/usr/local/lib/R/site-library’ (as ‘lib’ is unspecified) also installing the dependencies ‘RcppTOML’, ‘here’, ‘png’
library(BiocManager, lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
#loading libraries
library(MOFA2, lib="/content/drive/MyDrive/Colab Notebooks/R_lib")
library(MOFAdata, lib="/content/drive/MyDrive/Colab Notebooks/R_lib")
library(data.table, lib="/content/drive/MyDrive/Colab Notebooks/R_lib")
library(ggplot2, lib="/content/drive/MyDrive/Colab Notebooks/R_lib")
library(tidyverse, lib="/content/drive/MyDrive/Colab Notebooks/R_lib")
library(dplyr, lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
Attaching package: ‘MOFA2’
The following object is masked from ‘package:stats’:
predict
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ lubridate 1.9.3 ✔ tibble 3.2.1
✔ purrr 1.0.2 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between() masks data.table::between()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::first() masks data.table::first()
✖ lubridate::hour() masks data.table::hour()
✖ lubridate::isoweek() masks data.table::isoweek()
✖ dplyr::lag() masks stats::lag()
✖ dplyr::last() masks data.table::last()
✖ lubridate::mday() masks data.table::mday()
✖ lubridate::minute() masks data.table::minute()
✖ lubridate::month() masks data.table::month()
✖ lubridate::quarter() masks data.table::quarter()
✖ lubridate::second() masks data.table::second()
✖ purrr::transpose() masks data.table::transpose()
✖ lubridate::wday() masks data.table::wday()
✖ lubridate::week() masks data.table::week()
✖ lubridate::yday() masks data.table::yday()
✖ lubridate::year() masks data.table::year()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
mRNA = read.csv("/content/drive/MyDrive/last_processing/df_mrna.csv", row.names = 1)
miRNA = read.csv("/content/drive/MyDrive/last_processing/mirna.csv", row.names = 1)
proteo = read.csv("/content/drive/MyDrive/last_processing/proteo.csv", row.names = 1)
head(miRNA, 1)
head(mRNA, 1)
head(proteo, 1)
| n_C3N.01524 | n_C3L.00581 | n_C3N.00491 | n_C3N.00242 | n_C3N.00390 | n_C3L.00096 | n_C3L.00902 | n_C3N.00168 | n_C3L.00097 | n_C3N.00495 | ⋯ | t_C3N.00310.3 | t_C3N.00177.1 | t_C3N.00733.2 | t_C3L.01281 | t_C3N.00494.2 | t_C3N.00150.3 | t_C3L.00583.2 | t_C3N.01524.2 | t_C3L.00790 | t_C3N.00177.2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
| hsa-let-7a-2-3p | 1.356483 | 0.4957318 | -0.6293892 | -1.314967 | 0.7913207 | -0.2275072 | 0.2415203 | -0.7071888 | 0.9209041 | -0.1653171 | ⋯ | 1.951625 | -1.314967 | 0.2363172 | -1.314967 | 0.5985561 | -1.314967 | -0.05679405 | 0.6845321 | -0.7463363 | -0.8479218 |
| n_C3L.00004 | n_C3L.00010 | n_C3L.00011 | n_C3L.00026 | n_C3L.00079 | n_C3L.00088 | n_C3L.00096 | n_C3L.00097 | n_C3L.00103 | n_C3L.00360 | ⋯ | t_C3N.01220 | t_C3N.01261 | t_C3N.01361 | t_C3N.01522 | t_C3N.01524 | t_C3N.01646 | t_C3N.01648 | t_C3N.01649 | t_C3N.01651 | t_C3N.01808 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
| 5_8S_rRNA | 1.176939 | -0.2798527 | 0.9838293 | 1.048199 | 1.281963 | 1.210818 | 1.244697 | 1.112569 | 1.285351 | 1.044811 | ⋯ | -1.773911 | -1.648559 | -0.510229 | -1.475777 | -1.523207 | -0.5915383 | -0.2324223 | 0.5637313 | -0.4966775 | -0.7168902 |
| n_C3L.01287 | n_C3L.00561 | n_C3N.01524 | n_C3L.01603 | n_C3N.00834 | n_C3N.01214 | n_C3N.01261 | n_C3L.00917 | n_C3L.00607 | n_C3N.00194 | ⋯ | t_C3N.00305 | t_C3N.00315 | t_C3N.00317 | t_C3N.00491 | t_C3N.00314 | t_C3N.00380 | t_C3N.00312 | t_C3N.00494 | t_C3N.00390 | t_C3N.00310 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
| TSPAN6 | 0.9652532 | 0.1228923 | 0.5451083 | 0.6683124 | 0.2736633 | -0.01535197 | -0.4085153 | 0.1415862 | -0.2280995 | 0.531451 | ⋯ | -0.8844957 | -0.9172049 | 1.253981 | -0.3026219 | 0.01548283 | -1.225976 | 0.3858165 | -2.338225 | -2.508108 | 0.4336097 |
mRNA = as.matrix(mRNA)
miRNA = as.matrix(miRNA)
proteo = as.matrix(proteo)
colnames_miRNA <- colnames(miRNA)
cleaned_colnames_miRNA <- lapply(colnames_miRNA, function(x) gsub("\\.x|\\.y", "", x))
cleaned_colnames_miRNA <- lapply(cleaned_colnames_miRNA, function(x) gsub("\\.", "-", x))
colnames(miRNA) = cleaned_colnames_miRNA
colnames_proteo <- colnames(proteo)
cleaned_colnames_proteo <- lapply(colnames_proteo, function(x) gsub("\\.x|\\.y", "", x))
cleaned_colnames_proteo <- lapply(cleaned_colnames_proteo, function(x) gsub("\\.", "-", x))
colnames(proteo) = cleaned_colnames_proteo
colnames_mRNA <- colnames(mRNA)
cleaned_colnames_mRNA <- lapply(colnames_mRNA, function(x) gsub("\\.x|\\.y", "", x))
cleaned_colnames_mRNA <- lapply(cleaned_colnames_mRNA, function(x) gsub("\\.", "-", x))
colnames(mRNA) = cleaned_colnames_mRNA
common_samples = Reduce(intersect, list(colnames(miRNA), colnames(proteo), colnames(mRNA), metadata$case_id))
# Subset each dataset by common samples
miRNA = miRNA[, common_samples]
proteo = proteo[, common_samples]
#phospho = phospho[, common_samples]
mRNA = mRNA[, common_samples]
# Create a list of your omics layers
multiomics_data = list(
mRNA = mRNA,
miRNA = miRNA,
#phospho = phospho,
proteo = proteo
)
#check dimensionalities
lapply(multiomics_data, dim)
# loading metadata
metadata = read.csv("/content/combined_metdata.csv", row.names =1)
#metadata = metadata[-1, ]
head(metadata)
| case_id | Age | Sex | Tumor_Size_cm | Histologic_Grade | Tumor_necrosis | Path_Stage_pT | Path_Stage_pN | Stage | BMI | Tobacco_smoking_history | KDM5C_mutation | VHL_mutation | BAP1_mutation | PBRM1_mutation | SETD2_mutation | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <chr> | <int> | <chr> | <dbl> | <chr> | <chr> | <chr> | <chr> | <chr> | <dbl> | <chr> | <int> | <int> | <int> | <int> | <int> | <chr> | |
| 2 | t_C3L-00908 | 60 | Female | 10.5 | G3 Poorly differentiated | Not identified | pT2 | NA | Stage II | 27.85 | past smoker | 0 | 1 | 1 | 1 | 0 | tumor |
| 3 | t_C3L-00004 | 72 | Male | 12.0 | G3 Poorly differentiated | Present | pT3 | NA | Stage III | 22.80 | past smoker | 0 | 1 | 0 | 1 | 1 | tumor |
| 4 | t_C3L-00010 | 30 | Male | 6.5 | G3 Poorly differentiated | Not identified | pT1 | pN0 | Stage I | 34.15 | current smoker | 1 | 1 | 0 | 0 | 0 | tumor |
| 5 | t_C3L-00011 | 63 | Female | 12.0 | G4 Undifferentiated | Present | pT3 | NA | Stage IV | 27.47 | non-smoker | 1 | 1 | 1 | 0 | 0 | tumor |
| 6 | t_C3L-00026 | 65 | Female | 2.0 | G3 Poorly differentiated | Not identified | pT1 | NA | Stage I | 28.23 | non-smoker | 0 | 1 | 0 | 0 | 0 | tumor |
| 7 | t_C3L-00079 | 49 | Male | 14.0 | G3 Poorly differentiated | Present | pT3 | pN1 | Stage III | 37.88 | non-smoker | 1 | 1 | 0 | 1 | 0 | tumor |
#Create the MOFA obejct and train the model
MOFAobject <- create_mofa(multiomics_data)
MOFAobject
Creating MOFA object from a list of matrices (features as rows, sample as columns)...
Warning message in .rename_duplicated_features(object):
“There are duplicated features names across different views. We will add the suffix *_view* only for those features
Example: if you have both TP53 in mRNA and mutation data it will be renamed to TP53_mRNA, TP53_mutation”
Untrained MOFA model with the following characteristics: Number of views: 3 Views names: mRNA miRNA proteo Number of features (per view): 36351 2059 9025 Number of groups: 1 Groups names: group1 Number of samples (per group): 175
#Plot data overview
#Visualise the number of views (rows) and the number of groups (columns) exist, what are their corresponding dimensionalities and how many missing information they have (grey bars).
plot_data_overview(MOFAobject)
data_opts = get_default_data_options(MOFAobject)
data_opts
model_opts = get_default_model_options(MOFAobject)
model_opts
train_opts = get_default_training_options(MOFAobject)
train_opts$convergence_mode <- "slow"
train_opts$seed = 42
train_opts
#Prepare the MOFA object
MOFAobject <- prepare_mofa(MOFAobject,
data_options = data_opts,
model_options = model_opts,
training_options = train_opts
)
Warning message in prepare_mofa(MOFAobject, data_options = data_opts, model_options = model_opts, : “Some view(s) have a lot of features, it is recommended to perform a more stringent feature selection before creating the MOFA object....” Checking data options... Checking training options... Checking model options...
#training the model
MOFAobject=run_mofa(MOFAobject)
Warning message in run_mofa(MOFAobject):
“No output filename provided. Using /tmp/RtmpyEug4s/mofa_20240917-112459.hdf5 to store the trained model.
”
Connecting to the mofapy2 python package using reticulate (use_basilisk = FALSE)...
Please make sure to manually specify the right python binary when loading R with reticulate::use_python(..., force=TRUE) or the right conda environment with reticulate::use_condaenv(..., force=TRUE)
If you prefer to let us automatically install a conda environment with 'mofapy2' installed using the 'basilisk' package, please use the argument 'use_basilisk = TRUE'
Warning message in run_mofa(MOFAobject):
“The latest mofapy2 version is 0.7.0, you are using 0.7.1. Please upgrade with 'pip install mofapy2'”
save(MOFAobject, file = "/content/drive/MyDrive/last_processing/last_MOFAobject.RData")
load("/content/drive/MyDrive/last_processing/last_MOFAobject.RData")
slotNames(MOFAobject)
names(MOFAobject@data)
names(MOFAobject@data)
dim(MOFAobject@data$miRNA$group1)
#Factor and Weight values (expectations of the posterior distributions):
names(MOFAobject@expectations)
# Dimensionality of the factor matrix: 200 samples, 15 factors
dim(MOFAobject@expectations$Z$group1)
2.Add sample metadata to the model
common_samples <- intersect(metadata$case_id, unlist(samples_names(MOFAobject)))
length(common_samples)
stopifnot(all(sort(common_samples)==sort(unlist(samples_names(MOFAobject)))))
metadata2 = metadata[metadata$case_id %in% common_samples, ]
metadata2$sample = metadata2$case_id
# Sanity check
stopifnot(all(sort(common_samples)==sort(unlist(samples_names(MOFAobject)))))
# Add sample metadata to the model
samples_metadata(MOFAobject) = metadata2
3.Correlation between factors
plot_factor_cor(MOFAobject)
plot variance Decomposition
plot_variance_explained(MOFAobject, x="view", y="factor", max_r2=20,plot_total = T)[[1]]
Total variance explained per view
plot_variance_explained(MOFAobject, plot_total = T)[[2]]
#install.packages("psych")
#library(psych)
correlate_factors_with_covariates(MOFAobject, #covariates represent metadata
covariates = c("Age", "status", "BMI"),
plot="log_pval"
)
Warning message in correlate_factors_with_covariates(MOFAobject, covariates = c("Age", :
“There are non-numeric values in the covariates data.frame, converting to numeric...”
factors = MOFAobject@expectations$Z$group1
factor_df = as.data.frame(factors)
case_id = rownames(factor_df)
factor_df = cbind(case_id, factor_df)
index_id = match(metadata2$case_id, factor_df$case_id)
rownames(factor_df) = metadata2$case_id [index_id]
factor_df = merge(factor_df, metadata2, by.x = "case_id", by.y = "case_id", all.x = T)
#factor_df = factor_df[, -c(25:31)]
#factor_df = factor_df[, -c(18:24)]
#factor_df = factor_df[, -c(1, 17, 19)]
#rownames(factor_df) = case_id
factor_df %>%
ggplot(aes(y = Factor1, x = Factor2, color = status)) +
geom_point(size = 4) +
theme_classic() +
labs(x = "Factor2", y = "Factor1")
factor_df %>%
ggplot(aes(y = Factor1, x = Factor3, color = status)) +
geom_point(size = 4) +
theme_classic() +
labs(x = "Factor3", y = "Factor1")
#library(cowplot, lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
options(repr.plot.width = 15, repr.plot.height = 10)
p1 = factor_df %>%
ggplot(aes(x = status, y = Factor1, color = status, fill = status)) +
geom_boxplot() +
theme_classic() +
labs(x = "status", y = "Factor1")
p2 = factor_df %>%
ggplot(aes(x = status, y = Factor2, color = status, fill = status)) +
geom_boxplot() +
theme_classic() +
labs(x = "status", y = "Factor2")
p3 = factor_df %>%
ggplot(aes(x = status, y = Factor3, color = status, fill = status)) +
geom_boxplot() +
theme_classic() +
labs(x = "status", y = "Factor3")
plot_grid(p1, p2, p3, labels = "AUTO")
options(repr.plot.width = 15, repr.plot.height =10)
plot_factor(MOFAobject,
factors = 1:3,
color_by = "status"
)
p = plot_factor(MOFAobject,
factors = c(1,2,3),
color_by = metadata2$Status,
dot_size = 3,
dodge = T,
legend = F,
add_violin = T,
violin_alpha = 0.25
)
p = p +
scale_color_manual(values = c(A = "black", B ="red")) +
scale_fill_manual(values = c(A = "black", B ="red"))
print(p)
Scale for colour is already present. Adding another scale for colour, which will replace the existing scale. Scale for fill is already present. Adding another scale for fill, which will replace the existing scale. Warning message: “No shared levels found between `names(values)` of the manual scale and the data's fill values.” Warning message: “No shared levels found between `names(values)` of the manual scale and the data's fill values.”
3.Plot feature weights
3.1. Plot feature weights for mRNA
plot_weights(MOFAobject,
view = "mRNA",
factor = 1,
nfeatures = 10, # Top number of features to highlight
scale = T # Scale weights from -1 to 1
)
plot_top_weights(MOFAobject,
view = "mRNA",
factor = 1,
nfeatures = 20, # Top number of features to highlight
scale = T # Scale weights from -1 to 1
)
plot_factor(MOFAobject,
factors = 1,
color_by = "CLDN8",
add_violin = TRUE,
dodge = TRUE
)
options(repr.plot.width =7)
plot_factor(MOFAobject,
factors = 1,
color_by = "TMSB10_mRNA",
add_violin = TRUE,
dodge = TRUE
)
3.3.Plot molecular signatures in the input data
options(repr.plot.height = 8)
library(ggpubr, lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
plot_data_scatter(MOFAobject,
view = "mRNA",
factor = 1,
features = 4,
sign = "positive",
color_by = "status"
) + labs(y="mRNA expression")
options(repr.plot.height = 8)
plot_data_scatter(MOFAobject,
view = "mRNA",
factor = 1,
features = 4,
sign = "negative",
color_by = "status"
) + labs(y="mRNA expression")
options(repr.plot.width = 10, repr.plot.height = 12)
#An alternative visualisation is to use a heatmap
plot_data_heatmap(MOFAobject,
view = "mRNA",
factor = 1,
features = 30,
cluster_rows = FALSE, cluster_cols = FALSE,
show_rownames = TRUE, show_colnames = FALSE,
scale = "row"
)
weights = get_weights(MOFAobject, as.data.frame = T)
head(weights, 2)
| feature | factor | value | view | |
|---|---|---|---|---|
| <fct> | <fct> | <dbl> | <fct> | |
| 1 | 5_8S_rRNA | Factor1 | 0.39889929 | mRNA |
| 2 | 5S_rRNA | Factor1 | 0.06642034 | mRNA |
mRNA_factors = weights %>%
filter(view == "mRNA" & factor == "Factor1")
miRNA_factors = weights %>%
filter(view == "miRNA" & factor == "Factor1")
prot_factors = weights %>%
filter(view == "proteo" & factor == "Factor1")
mrna_list = mRNA_factors$feature
mirna_list = miRNA_factors$feature
prot_list = prot_factors$feature
#BiocManager::install("msigdbr", lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
library(msigdbr, lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
#BiocManager::install("fgsea", lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
library(fgsea, lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
genesets = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:KEGG")
genesets$gene_symbol = toupper(genesets$gene_symbol)
genesets_mrna = genesets[genesets$gene_symbol %in% mrna_list, ]
head(genesets_mrna)
| gs_cat | gs_subcat | gs_name | gene_symbol | entrez_gene | ensembl_gene | human_gene_symbol | human_entrez_gene | human_ensembl_gene | gs_id | gs_pmid | gs_geoid | gs_exact_source | gs_url | gs_description |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <chr> | <chr> | <chr> | <chr> | <int> | <chr> | <chr> | <int> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> |
| C2 | CP:KEGG | KEGG_ABC_TRANSPORTERS | ABCA10 | 10349 | ENSG00000154263 | ABCA10 | 10349 | ENSG00000154263 | M11911 | hsa02010 | http://www.genome.jp/kegg/pathway/hsa/hsa02010.html | ABC transporters | ||
| C2 | CP:KEGG | KEGG_ABC_TRANSPORTERS | ABCA12 | 26154 | ENSG00000144452 | ABCA12 | 26154 | ENSG00000144452 | M11911 | hsa02010 | http://www.genome.jp/kegg/pathway/hsa/hsa02010.html | ABC transporters | ||
| C2 | CP:KEGG | KEGG_ABC_TRANSPORTERS | ABCA13 | 154664 | ENSG00000179869 | ABCA13 | 154664 | ENSG00000179869 | M11911 | hsa02010 | http://www.genome.jp/kegg/pathway/hsa/hsa02010.html | ABC transporters | ||
| C2 | CP:KEGG | KEGG_ABC_TRANSPORTERS | ABCA3 | 21 | ENSG00000167972 | ABCA3 | 21 | ENSG00000167972 | M11911 | hsa02010 | http://www.genome.jp/kegg/pathway/hsa/hsa02010.html | ABC transporters | ||
| C2 | CP:KEGG | KEGG_ABC_TRANSPORTERS | ABCA4 | 24 | ENSG00000198691 | ABCA4 | 24 | ENSG00000198691 | M11911 | hsa02010 | http://www.genome.jp/kegg/pathway/hsa/hsa02010.html | ABC transporters | ||
| C2 | CP:KEGG | KEGG_ABC_TRANSPORTERS | ABCA5 | 23461 | ENSG00000154265 | ABCA5 | 23461 | ENSG00000154265 | M11911 | hsa02010 | http://www.genome.jp/kegg/pathway/hsa/hsa02010.html | ABC transporters |
mrna
my_list_mrna = genesets %>%
split(x = .$gene_symbol, f = .$gs_name)
head(my_list_mrna)
rownames(mRNA_factors) = mRNA_factors[,1]
#mRNA_factors = mRNA_factors[, -1]
head(mRNA_factors)
| factor | value | view | |
|---|---|---|---|
| <fct> | <dbl> | <fct> | |
| 5_8S_rRNA | Factor1 | 0.39889929 | mRNA |
| 5S_rRNA | Factor1 | 0.06642034 | mRNA |
| 7SK | Factor1 | 0.03052169 | mRNA |
| A1BG_mRNA | Factor1 | -0.04404991 | mRNA |
| A1BG-AS1 | Factor1 | -0.15101285 | mRNA |
| A1CF_mRNA | Factor1 | 0.17880762 | mRNA |
# Extract the stats for fgsea
stats_mrna = mRNA_factors[, 2]
names(stats_mrna) = rownames(mRNA_factors) # Set the row names as the names of stats_mrna
# Run fgsea
eaRes_rna = fgsea(pathways = my_list_mrna, stats = stats_mrna, nperm = 100)
Warning message in fgsea(pathways = my_list_mrna, stats = stats_mrna, nperm = 100): “You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.” Warning message in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : “There are ties in the preranked stats (2.43% of the list). The order of those tied genes will be arbitrary, which may produce unexpected results.”
head(eaRes_rna)
| pathway | pval | padj | ES | NES | nMoreExtreme | size | leadingEdge |
|---|---|---|---|---|---|---|---|
| <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <int> | <list> |
| KEGG_ABC_TRANSPORTERS | 0.14516129 | 0.3799883 | 0.4645781 | 1.3902022 | 8 | 24 | ABCA10, .... |
| KEGG_ACUTE_MYELOID_LEUKEMIA | 0.14583333 | 0.3799883 | -0.4263980 | -1.3025918 | 6 | 16 | CEBPA, P.... |
| KEGG_ADHERENS_JUNCTION | 0.15254237 | 0.3865800 | 0.5169783 | 1.4085838 | 8 | 15 | RAC3, NE.... |
| KEGG_ADIPOCYTOKINE_SIGNALING_PATHWAY | 0.43103448 | 0.7104692 | 0.3658581 | 1.0301545 | 24 | 19 | PPARA, A.... |
| KEGG_ALANINE_ASPARTATE_AND_GLUTAMATE_METABOLISM | 0.64814815 | 0.8426907 | 0.5477810 | 0.8677122 | 34 | 3 | GAD1, GLS2 |
| KEGG_ALDOSTERONE_REGULATED_SODIUM_REABSORPTION | 0.01754386 | 0.1303735 | 0.6317237 | 1.6950953 | 0 | 17 | KCNJ1, S.... |
ooEA_rna = order(eaRes_rna$pval, decreasing = FALSE)
eaRes_rna = eaRes_rna[ooEA_rna, ]
print(head(eaRes_rna))
pathway pval padj
<char> <num> <num>
1: KEGG_REGULATION_OF_ACTIN_CYTOSKELETON 0.01492537 0.1303735
2: KEGG_TIGHT_JUNCTION 0.01492537 0.1303735
3: KEGG_RETINOL_METABOLISM 0.01515152 0.1303735
4: KEGG_STARCH_AND_SUCROSE_METABOLISM 0.01538462 0.1303735
5: KEGG_DRUG_METABOLISM_CYTOCHROME_P450 0.01587302 0.1303735
6: KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 0.01587302 0.1303735
ES NES nMoreExtreme size leadingEdge
<num> <num> <num> <int> <list>
1: 0.4447024 1.699111 0 73 FGF9, PF....
2: 0.4945598 1.733880 0 49 CLDN8, C....
3: 0.5081981 1.704826 0 41 CYP2B6, ....
4: 0.6337164 1.899308 0 22 ENPP1, U....
5: 0.5778064 1.909978 0 38 CYP2B6, ....
6: 0.6111264 1.998732 0 36 CYP2B6, ....
topPathwaysUP_rna = eaRes_rna[ES>0 & pval<0.05][head(order(pval), n=10), pathway]
topPathwaysDown_rna = eaRes_rna[ES<0 & pval<0.05][head(order(pval), n=10), pathway]
topPathways_rna = c(topPathwaysUP_rna, rev(topPathwaysDown_rna))
plotGseaTable(my_list_mrna[topPathways_rna], stats_mrna, eaRes_rna,
gseaParam = 0.5)
plotEnrichment(my_list_mrna[["KEGG_REGULATION_OF_ACTIN_CYTOSKELETON"]],
stats_mrna) + labs(title = "")
# select the necessary columns
gsea_data = eaRes_rna[, c("pathway", "NES", "padj")]
# convert the FDR q-value to a logarithmic scale
gsea_data$padj = -log10(gsea_data$padj)
#create a new column to indicate the direction of enrichment
gsea_data$direction = ifelse(gsea_data$NES > 0, "UP", "DOWN")
options(repr.plot.width = 20, repr.plot.height = 15)
#creating volcano plot
ggplot(gsea_data, aes(x = NES, y = padj, color = direction, size = abs(NES))) +
geom_point() +
theme_classic() +
geom_text(aes(label = eaRes_rna$pathway), check_overlap = TRUE, vjust = 3, size = 3) +
labs(x = "Normalized Enrichment Score (NES)",
y = "-log10(FDR q-value)",
size = "NES Absolute Value") +
scale_color_manual(values = c("UP" = "red", "DOWN" = "blue")) +
theme(legend.position = "bottom")
# Create a logical vector indicating which rows to combine
rows_to_combine = eaRes_rna$pathway %in% topPathways_rna
#Combine the matching rows into a single list
combined_list = eaRes_rna$leadingEdge[rows_to_combine]
combined_list
genes_to_search = unlist(combined_list)
mrna_filt = mRNA[genes_to_search, ]
# Ensure metadata and mrna_filt have the same samples
common_samples <- intersect(colnames(mrna_filt), metadata2$case_id) # Adjust "sample_id" to the correct column name
# Filter metadata and matrix to contain only the common samples
metadata2_filtered <- metadata2[metadata2$case_id %in% common_samples, ]
mrna_filt_filtered <- mrna_filt[, common_samples]
#create hatmap for the top 10 pathways in mRNA
#library(ComplexHeatmap, lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
set.seed(123)
mycolors = rep("yellow", dim(metadata2_filtered)[1])
mycolors[which(metadata2_filtered=="tumor")] = "blue"
column_ha = HeatmapAnnotation(sample_type = metadata2_filtered$status)
Heatmap(mrna_filt_filtered, name = "top", top_annotation = column_ha)
The automatically generated colors map from the minus and plus 99^th of the absolute values in the matrix. There are outliers in the matrix whose patterns might be hidden by this color mapping. You can manually set the color to `col` argument. Use `suppressMessages()` to turn off this message.
proteomics
Plot feature weights for proteomics
plot_weights(MOFAobject,
view = "proteo",
factor = 1,
nfeatures = 20, # Top number of features to highlight
scale = T # Scale weights from -1 to 1
)
plot_top_weights(MOFAobject,
view = "proteo",
factor = 1,
nfeatures = 20, # Top number of features to highlight
scale = T # Scale weights from -1 to 1
)
plot_factor(MOFAobject,
factors = 1,
color_by = "NDUFS4_proteo",
add_violin = TRUE,
dodge = TRUE
)
plot_factor(MOFAobject,
factors = 1,
color_by = "TMSB10_proteo",
add_violin = TRUE,
dodge = TRUE
)
Plot molecular signatures in the input data
library(ggpubr, lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
plot_data_scatter(MOFAobject,
view = "proteo",
factor = 1,
features = 4,
sign = "positive",
color_by = "status"
) + labs(y="proteo expression")
dim(MOFAobject@data$proteo$group1)
library(ggpubr, lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
plot_data_scatter(MOFAobject,
view = "proteo",
factor = 1,
features = 4,
sign = "negative",
color_by = "status"
) + labs(y="proteo expression")
GENE SET ENRICHMENT ANALYSIS
library( msigdbr, lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
genesets = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:KEGG")
genesets$gene_symbol = toupper(genesets$gene_symbol)
genesets_prot = genesets[genesets$gene_symbol %in% prot_list, ]
genesets_prot
| gs_cat | gs_subcat | gs_name | gene_symbol | entrez_gene | ensembl_gene | human_gene_symbol | human_entrez_gene | human_ensembl_gene | gs_id | gs_pmid | gs_geoid | gs_exact_source | gs_url | gs_description |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <chr> | <chr> | <chr> | <chr> | <int> | <chr> | <chr> | <int> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> |
| C2 | CP:KEGG | KEGG_OLFACTORY_TRANSDUCTION | OR4C16 | 219428 | ENSG00000279514 | OR4C16 | 219428 | ENSG00000279514 | M14091 | hsa04740 | http://www.genome.jp/kegg/pathway/hsa/hsa04740.html | Olfactory transduction |
my_list_prot = genesets %>%
split(x = .$gene_symbol, f = .$gs_name)
head(my_list_prot)
head(prot_factors)
| feature | factor | value | view | |
|---|---|---|---|---|
| <fct> | <fct> | <dbl> | <fct> | |
| 1 | TSPAN6_proteo | Factor1 | 0.212688759 | proteo |
| 2 | DPM1_proteo | Factor1 | 0.210497901 | proteo |
| 3 | SCYL3_proteo | Factor1 | -0.149259895 | proteo |
| 4 | FGR_proteo | Factor1 | -0.262913644 | proteo |
| 5 | CFH_proteo | Factor1 | -0.001801545 | proteo |
| 6 | FUCA2_proteo | Factor1 | 0.307839185 | proteo |
rownames(prot_factors) = prot_factors[,1]
prot_factors = prot_factors[, -1]
# Extract the stats for fgsea
stats_prot = prot_factors[, 2]
names(stats_prot) = gsub("_proto", "", rownames(prot_factors))
my_list = lapply(my_list_prot, function(x) gsub("_proteo", "", x))
# Run fgsea
eaRes_prot = fgsea(pathways = my_list_prot, stats = stats_prot)
Warning message in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : “There are ties in the preranked stats (0.03% of the list). The order of those tied genes will be arbitrary, which may produce unexpected results.”
eaRes_prot
| pathway | pval | padj | log2err | ES | NES | size | leadingEdge |
|---|---|---|---|---|---|---|---|
| <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <int> | <list> |
| KEGG_OLFACTORY_TRANSDUCTION | 0.1527778 | 0.1527778 | 0.1585141 | 0.9195479 | 1.237797 | 1 | OR4C16 |
#####################NOTE: it is only one pathway with insignificant adjpavl
plotEnrichment(my_list_prot[["KEGG_OLFACTORY_TRANSDUCTION"]],
stats_prot) + labs(title = "")
miRNA
Plot feature weights
plot_weights(MOFAobject,
view = "miRNA",
factor = 1,
nfeatures = 10, # Top number of features to highlight
scale = T # Scale weights from -1 to 1
)
plot_top_weights(MOFAobject,
view = "miRNA",
factor = 1,
nfeatures = 20, # Top number of features to highlight
scale = T # Scale weights from -1 to 1
)
plot_factor(MOFAobject,
factors = 1,
color_by = "hsa-miR-888-5p",
add_violin = TRUE,
dodge = TRUE
)
plot_factor(MOFAobject,
factors = 1,
color_by = "hsa-miR-122-5p",
add_violin = TRUE,
dodge = TRUE
)
library(ggpubr, lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
plot_data_scatter(MOFAobject,
view = "miRNA",
factor = 1,
features = 4,
sign = "positive",
color_by = "status"
) + labs(y="miRNA expression")
library(ggpubr, lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
plot_data_scatter(MOFAobject,
view = "miRNA",
factor = 1,
features = 4,
sign = "positive",
color_by = "status"
) + labs(y="mRNA expression")
options(repr.plot.width = 10, repr.plot.height = 12)
#An alternative visualisation is to use a heatmap
plot_data_heatmap(MOFAobject,
view = "miRNA",
factor = 1,
features = 30,
cluster_rows = FALSE, cluster_cols = FALSE,
show_rownames = TRUE, show_colnames = FALSE,
scale = "row"
)
miRNA set Enrichment Analysis
# Load libraries
library(fgsea, lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
library(org.Hs.eg.db, lib = "/content/drive/MyDrive/Colab Notebooks/R_lib")
Loading required package: AnnotationDbi
Loading required package: stats4
Loading required package: BiocGenerics
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:lubridate’:
intersect, setdiff, union
The following objects are masked from ‘package:dplyr’:
combine, intersect, setdiff, union
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, aperm, append, as.data.frame, basename, cbind,
colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
tapply, union, unique, unsplit, which.max, which.min
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
Loading required package: IRanges
Loading required package: S4Vectors
Attaching package: ‘S4Vectors’
The following objects are masked from ‘package:lubridate’:
second, second<-
The following objects are masked from ‘package:dplyr’:
first, rename
The following object is masked from ‘package:tidyr’:
expand
The following objects are masked from ‘package:data.table’:
first, second
The following object is masked from ‘package:utils’:
findMatches
The following objects are masked from ‘package:base’:
expand.grid, I, unname
Attaching package: ‘IRanges’
The following object is masked from ‘package:lubridate’:
%within%
The following objects are masked from ‘package:dplyr’:
collapse, desc, slice
The following object is masked from ‘package:purrr’:
reduce
The following object is masked from ‘package:data.table’:
shift
Attaching package: ‘AnnotationDbi’
The following object is masked from ‘package:dplyr’:
select
my_list = genesets %>%
split(x = .$gene_symbol, f = .$gs_name)
head(my_list)
head(mrna_list)
#mirna_stats = mRNA_factors[,3]
names(mirna_stats) = miRNA_factors$feature
head(mirna_stats)
miRNA_factors = miRNA_factors[order(miRNA_factors$value), ]
mir_ids = miRNA_factors[1:100, ]
mir_ids = mir_ids$feature
write.table(mir_ids, "/content/mirna_ids.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
# Run fgsea
eaRes = fgsea(pathways = my_list, stats = stats, nperm = 100)
Warning message in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : “There are ties in the preranked stats (2.43% of the list). The order of those tied genes will be arbitrary, which may produce unexpected results.” Warning message in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : “There are duplicate gene names, fgsea may produce unexpected results.”